import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from plotly.offline import init_notebook_mode, iplot, plot
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn import preprocessing
from sklearn.cluster import KMeans, AgglomerativeClustering
# Load the mall customer dataset (expects Mall_Customers.csv in the working directory).
Mall_Customers = pd.read_csv("Mall_Customers.csv")
# Quick peek at the first five rows to sanity-check the load.
Mall_Customers.head()
| CustomerID | Gender | Age | Annual Income (k$) | Spending Score (1-100) | |
|---|---|---|---|---|---|
| 0 | 1 | Male | 19 | 15 | 39 |
| 1 | 2 | Male | 21 | 15 | 81 |
| 2 | 3 | Female | 20 | 16 | 6 |
| 3 | 4 | Female | 23 | 16 | 77 |
| 4 | 5 | Female | 31 | 17 | 40 |
# Replace the spaced/punctuated column labels with snake_case names that are
# easier to reference (e.g. as df.Annual_Income attributes).
Mall_Customers = Mall_Customers.rename(
    columns={
        'Annual Income (k$)': 'Annual_Income',
        'Spending Score (1-100)': 'Spending_Score',
    }
)
Mall_Customers.head()
| CustomerID | Gender | Age | Annual_Income | Spending_Score | |
|---|---|---|---|---|---|
| 0 | 1 | Male | 19 | 15 | 39 |
| 1 | 2 | Male | 21 | 15 | 81 |
| 2 | 3 | Female | 20 | 16 | 6 |
| 3 | 4 | Female | 23 | 16 | 77 |
| 4 | 5 | Female | 31 | 17 | 40 |
# (rows, columns) of the dataset -- 200 customers, 5 columns.
Mall_Customers.shape
(200, 5)
# Column dtypes and non-null counts; confirms there are no missing values
# and that Gender is the only non-numeric column.
Mall_Customers.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 200 entries, 0 to 199 Data columns (total 5 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 CustomerID 200 non-null int64 1 Gender 200 non-null object 2 Age 200 non-null int64 3 Annual_Income 200 non-null int64 4 Spending_Score 200 non-null int64 dtypes: int64(4), object(1) memory usage: 7.9+ KB
# Per-column count of missing values (all zero for this dataset).
Mall_Customers.isna().sum()
CustomerID 0 Gender 0 Age 0 Annual_Income 0 Spending_Score 0 dtype: int64
# Summary statistics (count/mean/std/quartiles) for the numeric columns.
Mall_Customers.describe()
| CustomerID | Age | Annual_Income | Spending_Score | |
|---|---|---|---|---|
| count | 200.000000 | 200.000000 | 200.000000 | 200.000000 |
| mean | 100.500000 | 38.850000 | 60.560000 | 50.200000 |
| std | 57.879185 | 13.969007 | 26.264721 | 25.823522 |
| min | 1.000000 | 18.000000 | 15.000000 | 1.000000 |
| 25% | 50.750000 | 28.750000 | 41.500000 | 34.750000 |
| 50% | 100.500000 | 36.000000 | 61.500000 | 50.000000 |
| 75% | 150.250000 | 49.000000 | 78.000000 | 73.000000 |
| max | 200.000000 | 70.000000 | 137.000000 | 99.000000 |
# Correlation heatmap of the numeric features.
# Select numeric columns explicitly: DataFrame.corr() raises on the
# non-numeric 'Gender' column in pandas >= 2.0 (older pandas silently
# dropped it), so this form is correct across versions.
corr = Mall_Customers.select_dtypes(include='number').corr()
# Boolean mask hiding the redundant upper triangle (including the diagonal).
mask = np.triu(np.ones_like(corr, dtype=bool))
plt.figure(figsize = (26,16))
sns.heatmap(corr, mask=mask, annot=True, cmap="RdYlGn", linewidths=.75)
<AxesSubplot:>
# Age vs. spending score scatter, with annual income encoded as marker color.
px.scatter(Mall_Customers, x = "Age", y = "Spending_Score", color = "Annual_Income", template = "seaborn", opacity = 1)
# Age histogram split by gender (y aggregates Spending_Score per bin),
# with a violin marginal showing each gender's age distribution.
fig = px.histogram(
    Mall_Customers,
    x="Age",
    y="Spending_Score",
    color="Gender",
    marginal="violin",
    nbins=100,
    text_auto=True,
    template="seaborn",
    width=None,
    height=None,
    hover_data=Mall_Customers.columns,
)
fig.show()
# Age histogram split by gender (y aggregates Annual_Income per bin),
# with a box-plot marginal summarizing each gender's age range.
fig = px.histogram(
    Mall_Customers,
    x="Age",
    y="Annual_Income",
    color="Gender",
    marginal="box",
    nbins=100,
    text_auto=True,
    template="seaborn",
    width=None,
    height=None,
    hover_data=Mall_Customers.columns,
)
fig.show()
# Annual-income histogram split by gender (y aggregates Spending_Score per
# bin), with a rug marginal marking the individual income values.
fig = px.histogram(
    Mall_Customers,
    x="Annual_Income",
    y="Spending_Score",
    color="Gender",
    marginal="rug",
    nbins=100,
    text_auto=True,
    template="seaborn",
    width=None,
    height=None,
    hover_data=Mall_Customers.columns,
)
fig.show()
# Select the numeric feature columns (public select_dtypes API instead of the
# private DataFrame._get_numeric_data). Non-numeric features would need to be
# encoded as numbers before they could be used here.
numerical_feats = list(Mall_Customers.select_dtypes(include='number').columns)
# CustomerID is an arbitrary row identifier, not a customer attribute: after
# standardization it becomes an evenly spread dimension that distorts the
# PCA projection and the cluster distances, so it is excluded.
if 'CustomerID' in numerical_feats:
    numerical_feats.remove('CustomerID')
dim_df = Mall_Customers[numerical_feats]
# standardize values: zero mean / unit variance so no single feature
# dominates the Euclidean distances used by PCA and k-means
x = dim_df.values
x = StandardScaler().fit_transform(x)
# apply PCA: project the standardized features onto the two leading
# principal components so the data can be visualized in 2-D
pca = PCA(n_components=2, random_state=7)
pca_mdl = pca.fit_transform(x)
# sanity-check the reduced shape (n_samples, 2)
pca_mdl.shape
(200, 2)
# convert PCA array output to dataframe (integer columns 0 and 1 hold
# the first and second principal components)
pca_df = pd.DataFrame(pca_mdl)
pca_df.head()
| 0 | 1 | |
|---|---|---|
| 0 | -2.403924 | -0.816423 |
| 1 | -2.348416 | -1.866763 |
| 2 | -2.393558 | 0.142008 |
| 3 | -2.307126 | -1.653214 |
| 4 | -2.332853 | -0.230933 |
When plotting this along with the target values, it is useful to see that they appear to be separable and in individual clusters.
# 2-D PCA projection colored by gender.
sns.scatterplot(x = pca_df[0], y = pca_df[1], hue=Mall_Customers.Gender)
<AxesSubplot:xlabel='0', ylabel='1'>
# 2-D PCA projection colored by annual income.
sns.scatterplot(x = pca_df[0], y = pca_df[1], hue=Mall_Customers.Annual_Income)
<AxesSubplot:xlabel='0', ylabel='1'>
# 2-D PCA projection colored by spending score.
sns.scatterplot(x = pca_df[0], y = pca_df[1], hue=Mall_Customers.Spending_Score)
<AxesSubplot:xlabel='0', ylabel='1'>
We will begin our modeling with K-Means Clustering.
The K-Means algorithm works as follows: choose k initial centroids, assign each observation to its nearest centroid, recompute each centroid as the mean of its assigned observations, and repeat until the assignments stop changing.
Remember how we determine the best number of clusters (if we can't just manually look at it and decide)?
We look at the variance -- or, the sum of squared distances between the observations and their centroids. Note: "inertia" is the "within-cluster sum-of-squares criterion." See scikit learn documentation.
import scipy.cluster.hierarchy as sch

# Ward-linkage dendrogram over the standardized features; large vertical
# gaps between merges hint at a natural number of clusters.
linkage_matrix = sch.linkage(x, method='ward')
plt.figure(figsize=(25, 12))
dendrogram = sch.dendrogram(linkage_matrix)
plt.title('Dendrogram plot')
plt.show()
# Elbow-method data: fit k-means for k = 1..10 and record the square root
# of the inertia (the within-cluster sum of squared distances).
inertia = [
    np.sqrt(KMeans(n_clusters=k, random_state=1).fit(x).inertia_)
    for k in range(1, 11)
]
Here, we see that the variance decreases sharply for small k and then falls off at a slower rate. The "elbow" where this slowdown occurs marks our preferred number of clusters; below we proceed with k = 5, consistent with the structure seen in the dendrogram.
# Elbow curve: variance (sqrt of inertia) vs. number of clusters k.
plt.plot(range(1, 11), inertia, marker='s');
plt.xlabel('$k$')
plt.ylabel('Variance')
Text(0, 0.5, 'Variance')
The elbow method runs k-means clustering on the dataset for a range of values of k (say 1 to 10).
# create and fit the final KMeans model with k=5 on the standardized features
kmeans = KMeans(n_clusters=5, random_state=1).fit(x)
Now that we have fit our k-means clusters, let's find which cluster (0-4, since we have set K=5) each row of data belongs to so we can visualize it.
# kmeans was already fitted on x above; calling fit_predict here would
# silently re-run the whole fit. The labels from that fit are stored on
# the model, so reuse them directly (identical result, no redundant work).
y = kmeans.labels_
We are reusing the PCA (dimensionality reduction) data frame for the sake of visualizing 2-dimensional data (rather than 5).
# PCA projection colored by the k-means cluster assignments.
sns.scatterplot(x = pca_df[0], y = pca_df[1], hue=y)
<AxesSubplot:xlabel='0', ylabel='1'>
We could also try plotting individual features to take a look.
# Annual income vs. spending score in original units, colored by k-means cluster.
sns.scatterplot(x = Mall_Customers['Annual_Income'], y = Mall_Customers['Spending_Score'], hue=y)
<AxesSubplot:xlabel='Annual_Income', ylabel='Spending_Score'>
# Age vs. spending score, colored by k-means cluster.
sns.scatterplot(x = Mall_Customers['Age'], y = Mall_Customers['Spending_Score'], hue=y)
<AxesSubplot:xlabel='Age', ylabel='Spending_Score'>
# Age vs. annual income, colored by k-means cluster.
sns.scatterplot(x = Mall_Customers['Age'], y = Mall_Customers['Annual_Income'], hue=y)
<AxesSubplot:xlabel='Age', ylabel='Annual_Income'>
Let's add our clusters back to the original DataFrame so we can take a look at some of the items.
# Attach the k-means cluster assignments back onto the original data.
# Row order is preserved throughout, so a positional concat lines up.
y_df = pd.DataFrame({'Cluster': y})
new_df = pd.concat([Mall_Customers, y_df], axis=1)
new_df
| CustomerID | Gender | Age | Annual_Income | Spending_Score | Cluster | |
|---|---|---|---|---|---|---|
| 0 | 1 | Male | 19 | 15 | 39 | 0 |
| 1 | 2 | Male | 21 | 15 | 81 | 0 |
| 2 | 3 | Female | 20 | 16 | 6 | 1 |
| 3 | 4 | Female | 23 | 16 | 77 | 0 |
| 4 | 5 | Female | 31 | 17 | 40 | 1 |
| ... | ... | ... | ... | ... | ... | ... |
| 195 | 196 | Female | 35 | 120 | 79 | 3 |
| 196 | 197 | Female | 45 | 126 | 28 | 2 |
| 197 | 198 | Male | 32 | 126 | 74 | 3 |
| 198 | 199 | Male | 32 | 137 | 18 | 2 |
| 199 | 200 | Male | 30 | 137 | 83 | 3 |
200 rows × 6 columns
# Interactive scatter of the PCA projection (columns 0 and 1 of pca_df),
# colored by k-means cluster, with the customer attributes shown on hover.
plot_df = pd.concat([new_df, pca_df], axis=1)
fig = px.scatter(
    plot_df,
    x=0,
    y=1,
    color='Cluster',
    hover_data=['Gender', 'Age', 'Annual_Income', 'Spending_Score'],
)
fig.show()
Let's try agglomerative clustering with the same dataset as what we did above to see how it differs. But first, can you give a brief description of Agglomerative Clustering?
# Ward-linkage dendrogram with two candidate cut heights (dashed lines);
# cutting between them yields the five-cluster solution used below.
# NOTE(review): a dendrogram's x-axis shows sample (leaf) indices, not a
# feature value, so the previous 'Spending Score' x-label was misleading.
linkage_matrix = sch.linkage(x, method='ward')
plt.figure(figsize=(25, 15))
plt.title('Dendrogram')
plt.xlabel('Customers')
plt.ylabel('Euclidean distances')
plt.axhline(y=5, color='r', linestyle='--')
plt.axhline(y=16, color='r', linestyle='--')
dend = sch.dendrogram(linkage_matrix)
# Implement the agglomerative model.
# 'ward' linkage always uses Euclidean distance, which is also the default
# metric, so the old affinity='euclidean' argument (deprecated in
# scikit-learn 1.2 and removed in 1.4, renamed to `metric`) is dropped --
# this keeps the call working across scikit-learn versions with identical
# behavior.
agglo = AgglomerativeClustering(n_clusters=5, linkage='ward')
Now let's fit the model and generate predictions so we can visualize the clusters!
# Fit the hierarchical model and get each row's cluster assignment (0-4).
y_agglo = agglo.fit_predict(x)
Now let's visualize! We will once again be using PCA to do so.
# PCA projection colored by the agglomerative cluster assignments.
sns.scatterplot(x = pca_df[0], y = pca_df[1], hue=y_agglo)
<AxesSubplot:xlabel='0', ylabel='1'>
Now let's look at the K-Means visual again to compare.
# Same projection colored by the k-means labels, for side-by-side comparison.
sns.scatterplot(x = pca_df[0], y = pca_df[1], hue=y)
<AxesSubplot:xlabel='0', ylabel='1'>
Let's also look at some separate features again. We will be looking at annual income and spending score, just as we did with K-Means!
# Annual income vs. spending score, colored by the agglomerative clusters.
sns.scatterplot(x = Mall_Customers['Annual_Income'], y = Mall_Customers['Spending_Score'], hue=y_agglo)
<AxesSubplot:xlabel='Annual_Income', ylabel='Spending_Score'>
Once again, pulling up the K-means visual for quick comparison.
# Annual income vs. spending score, colored by the k-means clusters (comparison).
sns.scatterplot(x = Mall_Customers['Annual_Income'], y = Mall_Customers['Spending_Score'], hue=y)
<AxesSubplot:xlabel='Annual_Income', ylabel='Spending_Score'>
Both the K-means and Agglomerative clustering appear identical.
Let's make an interactive scatterplot again! Remember to note that the x- and y-axes are our PCA values (from dimensionality reduction). Below, we concat the dataframe along with the PCA values so that we can visualize properly.
# Attach the agglomerative labels to the data and draw the interactive
# PCA scatter (x and y are the two principal components), with the
# original customer attributes available on hover.
y_a_df = pd.DataFrame({'Cluster (Agglomerative)': y_agglo})
new_a_df = pd.concat([Mall_Customers, y_a_df], axis=1)
plot_a_df = pd.concat([new_a_df, pca_df], axis=1)
fig = px.scatter(
    plot_a_df,
    x=0,
    y=1,
    color='Cluster (Agglomerative)',
    hover_data=['Gender', 'Age', 'Annual_Income', 'Spending_Score'],
)
fig.show()